<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.4.2">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/avatar.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/avatar.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/avatar.png">
  <link rel="mask-icon" href="/images/avatar.png" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib/font-awesome/css/all.min.css">
  <!-- Fancybox styles from jsDelivr. The scheme is pinned to https so the file is
       never fetched over plain http and still loads when the page is opened from file://. -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3/dist/jquery.fancybox.min.css">

<script id="hexo-configurations">
    // Reuse window.NexT when it already exists; otherwise start from an empty object.
    var NexT = window.NexT || {};
    // Global settings object. The inline scripts further down this page read
    // CONFIG.comments, CONFIG.bookmark and CONFIG.motion from it, and the
    // #page-configurations script attaches CONFIG.page to it.
    // NOTE(review): presumably serialized from the theme config at build time — do not edit by hand; confirm.
    var CONFIG = {"hostname":"notes.maxwi.com","root":"/","scheme":"Mist","version":"7.8.0","exturl":false,"sidebar":{"position":"right","display":"post","padding":18,"offset":12,"onmobile":false},"copycode":{"enable":false,"show_result":false,"style":null},"back2top":{"enable":true,"sidebar":false,"scrollpercent":false},"bookmark":{"enable":true,"color":"#222","save":"auto"},"fancybox":true,"mediumzoom":false,"lazyload":true,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":true,"nav":null},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false},"motion":{"enable":false,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},"path":"search.xml"};
  </script>

  <meta name="description" content="简介线程块中线程总数的大小除了受到硬件中Max Threads Per block的限制，同时还要受到Streaming Multiprocessor、Register和Shared Memory的影响。这些条件的共同作用下可以获得一个相对更合适的block尺寸。当block尺寸太小时，将无法充分利用所有线程；当block尺寸太大时，如果线程需要的资源总和过多，CUDA将通过强制减少block数量">
<meta property="og:type" content="article">
<meta property="og:title" content="Streaming MultiProcessor、Register、Shared-Memory对CUDA线程块尺寸的影响">
<meta property="og:url" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/index.html">
<meta property="og:site_name" content="blueyi&#39;s notes">
<meta property="og:description" content="简介线程块中线程总数的大小除了受到硬件中Max Threads Per block的限制，同时还要受到Streaming Multiprocessor、Register和Shared Memory的影响。这些条件的共同作用下可以获得一个相对更合适的block尺寸。当block尺寸太小时，将无法充分利用所有线程；当block尺寸太大时，如果线程需要的资源总和过多，CUDA将通过强制减少block数量">
<meta property="og:locale" content="en_US">
<meta property="og:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/1.png">
<meta property="og:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/2.png">
<meta property="og:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/3.png">
<meta property="og:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/4.png">
<meta property="og:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/5.png">
<meta property="article:published_time" content="2015-08-12T02:46:17.000Z">
<meta property="article:modified_time" content="2015-08-12T02:46:17.000Z">
<meta property="article:author" content="blueyi">
<meta property="article:tag" content="CUDA">
<meta property="article:tag" content="Notes">
<meta property="article:tag" content="Top">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/1.png">

<link rel="canonical" href="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/">


<script id="page-configurations">
  // Page-level values attached to the global CONFIG object.
  // Reference for the underlying page variables: https://hexo.io/docs/variables.html
  // This script element is one of the nodes the pjax:success handler re-inserts,
  // so it must stay a plain assignment (no top-level let/const).
  CONFIG.page = { sidebar: '', isHome: false, isPost: true, lang: 'en' };
</script>

  <title>Streaming MultiProcessor、Register、Shared-Memory对CUDA线程块尺寸的影响 | blueyi's notes</title>
  






  <noscript>
  <style>
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header { opacity: initial; }

  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

<link rel="alternate" href="/atom.xml" title="blueyi's notes" type="application/atom+xml">
<link rel="alternate" href="/rss2.xml" title="blueyi's notes" type="application/rss+xml">
</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="Toggle navigation bar">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <span class="logo-line-before"><i></i></span>
      <h1 class="site-title">blueyi's notes</h1>
      <span class="logo-line-after"><i></i></span>
    </a>
      <p class="site-subtitle" itemprop="description">Follow Excellence,Success will chase you!</p>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>




<nav class="site-nav">
  <ul id="menu" class="main-menu menu">
        <li class="menu-item menu-item-home">

    <a href="/" rel="section"><i class="fa fa-home fa-fw"></i>Home</a>

  </li>
        <li class="menu-item menu-item-categories">

    <a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>Categories</a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>Archives</a>

  </li>
        <li class="menu-item menu-item-tags">

    <a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>Tags</a>

  </li>
        <li class="menu-item menu-item-about">

    <a href="/about/" rel="section"><i class="fa fa-user fa-fw"></i>About</a>

  </li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>Search
        </a>
      </li>
  </ul>
</nav>



  <div class="search-pop-overlay">
    <div class="popup search-popup">
        <div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off"
           placeholder="Searching..." spellcheck="false"
           type="search" class="search-input">
  </div>
  <span class="popup-btn-close">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div id="search-result">
  <div id="no-result">
    <i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>
  </div>
</div>

    </div>
  </div>

</div>
    </header>

    
  <div class="back-to-top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>
  <div class="reading-progress-bar"></div>
  <a role="button" class="book-mark-link book-mark-link-fixed"></a>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content post posts-expand">
            

    
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block" lang="en">
    <link itemprop="mainEntityOfPage" href="http://notes.maxwi.com/2015/08/12/CUDA-Determine-block-size/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/default_avatar.jpg">
      <meta itemprop="name" content="blueyi">
      <meta itemprop="description" content="心怀善意，虛怀若谷！">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="blueyi's notes">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          Streaming MultiProcessor、Register、Shared-Memory对CUDA线程块尺寸的影响
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-calendar"></i>
              </span>
              <span class="post-meta-item-text">Posted on</span>

              <time title="Created: 2015-08-12 10:46:17" itemprop="dateCreated datePublished" datetime="2015-08-12T10:46:17+08:00">2015-08-12</time>
            </span>
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-folder"></i>
              </span>
              <span class="post-meta-item-text">In</span>
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/CUDA/" itemprop="url" rel="index"><span itemprop="name">CUDA</span></a>
                </span>
            </span>

          <br>
            <span class="post-meta-item" title="Symbols count in article">
              <span class="post-meta-item-icon">
                <i class="far fa-file-word"></i>
              </span>
                <span class="post-meta-item-text">Symbols count in article: </span>
              <span>5.7k</span>
            </span>
            <span class="post-meta-item" title="Reading time">
              <span class="post-meta-item-icon">
                <i class="far fa-clock"></i>
              </span>
                <span class="post-meta-item-text">Reading time &asymp;</span>
              <span>5 mins.</span>
            </span>

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <h2 id="简介"><a href="#简介" class="headerlink" title="简介"></a>简介</h2><p>线程块中线程总数的大小除了受到硬件中<code>Max Threads Per block</code>的限制，同时还要受到<code>Streaming Multiprocessor、Register</code>和<code>Shared Memory</code>的影响。这些条件的共同作用下可以获得一个相对更合适的<code>block</code>尺寸。当<code>block</code>尺寸太小时，将无法充分利用所有线程；当<code>block</code>尺寸太大时，如果线程需要的资源总和过多，<code>CUDA</code>将通过强制减少<code>block</code>数量来保证资源供应，同样无法利用所有线程。而<code>grid</code>的尺寸通常越大越好，当<code>grid</code>中的线程总数超过一次所能启动的并发线程总数时，过多的线程将以线程块为单位由CUDA进行新的调用，当然启动数量够用就可以了，以免浪费资源。</p>
<span id="more"></span>

<p>但具体最多我们可以定义的grid尺寸是多大跟计算能力倒是关系不大，所以你会发现消费级GTX970与专业级Tesla P4的计算能力分别是5.2和6.1，但可以定义的grid尺寸却相同。在我的测试中，同一段代码，一维网格和block的情况下，Block尺寸都为1024时，GTX1080的grid在win10系统桌面由显卡驱动的情况下，最多可以是78（WIN10系统）。而linux下的Tesla P4和GTX970的grid最大值都是680。<br>然而二维grid和block的时候，block尺寸虽然有1024的限制，但grid尺寸可以大到你的int放不下，当然我们并没有这么多的流处理器在运算，只是过多的block在排着队等着被执行。<br>而CUDA的调度中为了隐藏内存访问带来的latency，一个block中的线程数量越多越好（确切来说是warp数量）。</p>
<p>本文需要通过<code>NVIDIA</code>官方提供的一个非常有用的工具<code>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0\tools\CUDA_Occupancy_Calculator.xls</code>进行辅助计算来获得合适的<code>block</code>大小。这个<code>excel</code>表中的<code>Help</code>表单有详细的使用说明，以下分析将基于这个工具。<br>可以通过在编译时添加<code>--ptxas-options=-v</code>参数在编译时输出当前程序中核心函数所使用的<code>Register</code>、<code>Shared Memory</code>等大小，或者使用<code>nvcc --resource-usage kernel.cu</code>来获取，注意此处获取到的值只是nvcc在编译期为核函数分配的相应空间，如果通过动态的方式分配过shared memory，则需要单独加上你手动动态分配的shared memory才是总的shared memory。<br>输出的ptxas info可能包含的信息表示如下：<br><code>registers</code>: 寄存器<br><code>lmem</code>: <code>local memory</code> 局部内存<br><code>smem</code>: <code>shared memory</code> 共享内存<br><code>cmem</code>: <code>constant memory</code> 常量内存<br>其中<code>cmem</code>又可以分为以下一些情况（不全）：<br><code>cmem[0]</code> kernel arguments<br><code>cmem[2]</code> user defined <code>__constant__</code> objects<br><code>cmem[16]</code> compiler generated constants (some of which may correspond to literal constants in the source code)</p>
<p>参考：<a target="_blank" rel="noopener" href="https://devtalk.nvidia.com/default/topic/493425/ptxas-info-unexplained-what-is-cmem-n-/">https://devtalk.nvidia.com/default/topic/493425/ptxas-info-unexplained-what-is-cmem-n-/</a></p>
<p>显卡信息可以由官方示例中的<code>deviceQuery</code>得出，当然也可以查官方说明。<br>看着这个表挺复杂，其实只需要记住我们所要设定的线程块尺寸只要能够保证<strong>SM满占</strong>即可。</p>
<h2 id="SM的限制对block尺寸的影响"><a href="#SM的限制对block尺寸的影响" class="headerlink" title="SM的限制对block尺寸的影响"></a>SM的限制对block尺寸的影响</h2><p><code>Multiprocessor</code>(以下所有的<code>Multiprocessor</code>都是指<code>Streaming Multiprocessor</code>，即SM,具体SM在GPU中的结构组织参见<a href="http://notes.maxwi.com/2015/06/11/CUDA-study-notes/">http://notes.maxwi.com/2015/06/11/CUDA-study-notes/</a>基本概念中的图示)中<code>thread</code>数量的限制主要影响工作状态的线程是否能够占满当前的整个<code>Multiprocessor</code>。为了使工作状态的线程能够占满整个<code>Multiprocessor</code>，<code>block</code>中的<code>thread</code>的总数应当不小于<code>Max Threads Per Multiprocessor / Max Thread Blocks Per Multiprocessor</code>，对于该表，也就是使<code>Occupancy of each Multiprocessor</code>为<code>100%</code>即可。<code>Max Thread Blocks Per Multiprocessor</code>无法由<code>deviceQuery</code>得到，表中会给出，其实也很容易计算得到<code>Max Threads Per Multiprocessor / 32</code>（因为一个block一次最小也要启动一个warp，即32个线程），同一计算能力的显卡这些基本信息都是一样的。</p>
<p>下图所示为CUDA_Occupancy_Calculator给出的计算能力为5.0的GPU的参数。<br><img data-src="1.png" alt="Physical Limits for GPU Compute Capability">  </p>
<p>此时线程块中的线程数量应当不小于2048/32 = 64。当线程块中的线程数量小于64时，由于每个Multiprocessor中可以含有的最大线程块只有32个，所以此时32个线程块乘以这个小于64的数字必然要小于2048，也就是无论怎样都无法使线程填满整个Multiprocessor，导致SM中会有空闲的Streaming Processor。所以要填满SM，就要求每个线程块中至少需要有64个线程。如果将线程块尺寸设置为1024，则此时根据SM总线程的限制2048，每个SM正好可以启动2个线程块，如果我们每个线程块上的资源都够用，那这就是一个最佳尺寸。然而通常资源是最大限制，所以下面借助官方工具来综合考虑资源占用下的线程块尺寸。</p>
<p>注意：1). 当前计算能力5.0的设备每个线程块中线程数量上限Maximum Thread Block Size=1024；2). CUDA中线程组织单位为Warp，此处Threads per Warp=32，所以线程块中的线程数量应为32的倍数；3). 各维都有大小限制，计算能力5.0的三维分别为1024,1024,64。</p>
<h2 id="CUDA-Occupancy-Calculator的使用"><a href="#CUDA-Occupancy-Calculator的使用" class="headerlink" title="CUDA_Occupancy_Calculator的使用"></a>CUDA_Occupancy_Calculator的使用</h2><p>下图为<code>CUDA_Occupancy_Calculator</code>计算结果，由于此处只讨论了SM中线程及线程块的限制对<code>block</code>尺寸的影响，所以这里只需要使曲线的红三角在波峰即可。注意第一张图中的1.)、1.b)需要根据自己的硬件情况进行选择；2).需要自己填入的程序参数(可以通过2.2的方法获得)，3).工具计算出的GPU使用率，显示此时占用为100%，也就是SM中的活动线程束为64个，即64*32=2048，SM处于满占状态。其他各图是其对应的曲线，由第二幅图可以看出在<code>register</code>和<code>shared memory</code>固定的情况下，block尺寸设置为256, 512, 1024时都可以占满SM，block尺寸越小，同一个block中可使用的Shared Memory和每个线程中可使用Register就越大，因为同一个SM中的block共享这些资源。它们都有以下特点：1). 大于64，满足不小于<code>Max Threads Per Multiprocessor / Max Thread Blocks Per Multiprocessor</code>; 2). 32的整数倍，满足线程束的最小单位Warp; 3). 总线程数能够整除2048，因为<code>Max Threads Per Multiprocessor=2048</code>。<br><img data-src="2.png" alt="CUDA Occupancy Calculator"><br><img data-src="3.png" alt="Impact of Varying Block Size"><br><img data-src="4.png" alt="Impact of Varying Shared Memory Usage Per Block"><br><img data-src="5.png" alt="Impact of Varying Register Count Per Thread ">  </p>
<h2 id="Register对block尺寸的影响"><a href="#Register对block尺寸的影响" class="headerlink" title="Register对block尺寸的影响"></a>Register对block尺寸的影响</h2><p>计算能力为5.0的设备<code>Registers per Multiprocessor=65536，Max Registers per Thread=256</code>。也就是说一个SM总共也就只有65536个<code>register</code>，一个<code>thread</code>最多能定义256大小的<code>register</code>，根据上图可以看出，显然当<code>Register Per Thread</code>大于32时性能就要开始降低了。因为当前情况下SM中的所有线程都被占满了(此处占满意思是所有线程都为活动线程)，也就是说在这种block参数配置下，一个SM最多可以启动2048个线程(注意是最多可以，并不是说一定要，比如我一共就1000个数据，当然就启动1000个线程就可以了)，由于SM中能使用的register最多只有65536，当SM中的资源不够用时，SM就会强制减少block，所以<code>Register Per Thread</code>应该不大于<code>Registers per Multiprocessor / Max Threads per Multiprocessor</code>，也就是65536 / 2048 = 32，如何降低register占用是个很难的调整阶段，只能慢慢多调。</p>
<h2 id="Shared-Memory对block尺寸的影响"><a href="#Shared-Memory对block尺寸的影响" class="headerlink" title="Shared Memory对block尺寸的影响"></a>Shared Memory对block尺寸的影响</h2><p>计算能力为5.0的设备<code>Shared Memory per Multiprocessor (bytes)=65536，Max Shared Memory per Block=49152</code>。也就是说block中的<code>smem(Shared Memory)</code>必须要小于49152，要想使得SM中的线程全部占满，那么整个SM中占用的smem必须小于65536。由于smem是以block为单位进行分配，所以当smem不够用时也就会以block为单位进行减少线程。所以<code>Shared Memory Per Block (bytes)</code>应该不大于<code>Shared Memory per Multiprocessor / Active Thread Blocks per Multiprocessor = Shared Memory per Multiprocessor / (Max Threads per Multiprocessor / Threads Per Block)</code>，当block尺寸设置为256时，<code>Shared Memory Per Block (bytes)</code>最大值为65536 / (2048/256) = 8192，单位是bytes。</p>
<h2 id="总结"><a href="#总结" class="headerlink" title="总结"></a>总结</h2><p>block最佳尺寸应该满足：<br>1). 不小于<code>Max Threads Per Multiprocessor / Max Thread Blocks Per Multiprocessor</code>；<br>2). 32的整数倍；<br>3). 可以整除<code>Max Threads Per Multiprocessor</code>。<br>4). <code>Register Per Thread</code>应该不大于<code>Registers per Multiprocessor / Max Threads per Multiprocessor</code>，否则根据<code>CUDA_Occupancy_Calculator.xls</code>参考调节block尺寸以获得最佳性能。  </p>
<p>5). <code>Shared Memory Per Block (bytes)</code>最大值应该不大于<code>Shared Memory per Multiprocessor / (Max Threads per Multiprocessor / Threads Per Block)</code>，否则根据<code>CUDA_Occupancy_Calculator.xls</code>参考调节block尺寸以获得最佳性能。<br><strong>一句话总结：保证每个SM中可以启动的线程总数达到最大值的情况下block中的线程数越大越好。</strong></p>
<p>如果原理都懂了，也可以直接使用CUDA 6.5之后带的一个API，自动帮你计算Grid尺寸和block尺寸：<code>cudaOccupancyMaxPotentialBlockSize()</code>，该API在<code>cuda_runtime.h</code>头文件中，必须采用NVCC编译才能使用，其定义如下：</p>
<figure class="highlight cpp"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">template</span>&lt;<span class="keyword">class</span> T&gt;</span></span><br><span class="line"><span class="function">__inline__ __host__ CUDART_DEVICE cudaError_t <span class="title">cudaOccupancyMaxPotentialBlockSize</span><span class="params">(</span></span></span><br><span class="line"><span class="params"><span class="function">    <span class="type">int</span>    *minGridSize,</span></span></span><br><span class="line"><span class="params"><span class="function">    <span class="type">int</span>    *blockSize,</span></span></span><br><span class="line"><span class="params"><span class="function">    T       func,</span></span></span><br><span class="line"><span class="params"><span class="function">    <span class="type">size_t</span>  dynamicSMemSize = <span class="number">0</span>,</span></span></span><br><span class="line"><span class="params"><span class="function">    <span class="type">int</span>     blockSizeLimit = <span class="number">0</span>)</span></span></span><br><span class="line"><span class="function"></span>&#123;</span><br><span class="line">    <span class="keyword">return</span> <span class="built_in">cudaOccupancyMaxPotentialBlockSizeVariableSMem</span>(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>
<p>参数意义：</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">minGridSize     = Suggested min grid size to achieve a full machine launch.</span><br><span class="line">blockSize       = Suggested block size to achieve maximum occupancy.</span><br><span class="line">func            = Kernel function.</span><br><span class="line">dynamicSMemSize = Size of dynamically allocated shared memory. Of course, it is known at runtime before any kernel launch. The size of the statically allocated shared memory is not needed as it is inferred by the properties of func.</span><br></pre></td></tr></table></figure>
<p>参考：<a target="_blank" rel="noopener" href="https://stackoverflow.com/questions/9985912/how-do-i-choose-grid-and-block-dimensions-for-cuda-kernels">https://stackoverflow.com/questions/9985912/how-do-i-choose-grid-and-block-dimensions-for-cuda-kernels</a></p>
<p>注意：  </p>
<ol>
<li>没有标明单位的量其单位都为个，例如register。  </li>
<li><code>CUDA_Occupancy_Calculator</code>给出了<code>register</code>和<code>shared memory</code>的分配单位，精确优化时应该非常有用。<br>Register allocation unit size        256<br>Register allocation granularity        warp<br>Shared Memory allocation unit size  256<br>Warp allocation granularity         4   </li>
</ol>

    </div>

    
    
    
        

  <div class="followme">
    <p>Welcome to my other publishing channels</p>

    <div class="social-list">

        <div class="social-item">
          <a target="_blank" class="social-link" href="/atom.xml">
            <span class="icon">
              <i class="fa fa-rss"></i>
            </span>

            <span class="label">RSS</span>
          </a>
        </div>
    </div>
  </div>


      <footer class="post-footer">
          <div class="post-tags">
              <a href="/tags/CUDA/" rel="tag"># CUDA</a>
              <a href="/tags/Notes/" rel="tag"># Notes</a>
              <a href="/tags/Top/" rel="tag"># Top</a>
          </div>

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2015/08/12/Determine-block-size/" rel="prev" title="Streaming MultiProcessor、Register、Shared-Memory对线程块尺寸的影响">
      <i class="fa fa-chevron-left"></i> Streaming MultiProcessor、Register、Shared-Memory对线程块尺寸的影响
    </a></div>
      <div class="post-nav-item">
    <a href="/2015/09/11/C-sharp-study-notes/" rel="next" title="C-sharp-study-notes">
      C-sharp-study-notes <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  



          </div>
          
    <div class="comments" id="gitalk-container"></div>

<script>
  // On 'tabs:register', click the comment tab link that matches the preferred
  // class name, if such a link exists in the document.
  window.addEventListener('tabs:register', function () {
    // NOTE(review): the CONFIG.comments object serialized in this page has an
    // `active` key but no `activeClass`, so this reads undefined unless it is
    // set elsewhere — confirm against the theme.
    var preferred = CONFIG.comments.activeClass;
    if (CONFIG.comments.storage) {
      // A non-empty value saved in localStorage takes precedence.
      var saved = localStorage.getItem('comments_active');
      if (saved) {
        preferred = saved;
      }
    }
    if (!preferred) {
      return;
    }
    var tabLink = document.querySelector('a[href="#comment-' + preferred + '"]');
    if (tabLink) {
      tabLink.click();
    }
  });
  // Only remember the clicked tab when storage is enabled in the config.
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', function (event) {
      var pane = event.target;
      // Ignore events whose target is not a comment tab pane.
      if (pane.matches('.tabs-comment .tab-content .tab-pane')) {
        // The second class on the pane is what gets stored and later looked up.
        localStorage.setItem('comments_active', pane.classList[1]);
      }
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          Table of Contents
        </li>
        <li class="sidebar-nav-overview">
          Overview
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E7%AE%80%E4%BB%8B"><span class="nav-number">1.</span> <span class="nav-text">简介</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#SM%E7%9A%84%E9%99%90%E5%88%B6%E5%AF%B9block%E5%B0%BA%E5%AF%B8%E7%9A%84%E5%BD%B1%E5%93%8D"><span class="nav-number">2.</span> <span class="nav-text">SM的限制对block尺寸的影响</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#CUDA-Occupancy-Calculator%E7%9A%84%E4%BD%BF%E7%94%A8"><span class="nav-number">3.</span> <span class="nav-text">CUDA_Occupancy_Calculator的使用</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Register%E5%AF%B9block%E5%B0%BA%E5%AF%B8%E7%9A%84%E5%BD%B1%E5%93%8D"><span class="nav-number">4.</span> <span class="nav-text">Register对block尺寸的影响</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Shared-Memory%E5%AF%B9block%E5%B0%BA%E5%AF%B8%E7%9A%84%E5%BD%B1%E5%93%8D"><span class="nav-number">5.</span> <span class="nav-text">Shared Memory对block尺寸的影响</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E6%80%BB%E7%BB%93"><span class="nav-number">6.</span> <span class="nav-text">总结</span></a></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="blueyi"
      src="/images/default_avatar.jpg">
  <p class="site-author-name" itemprop="name">blueyi</p>
  <div class="site-description" itemprop="description">心怀善意，虛怀若谷！</div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">104</span>
          <span class="site-state-item-name">posts</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">26</span>
        <span class="site-state-item-name">categories</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/tags/">
          
        <span class="site-state-item-count">68</span>
        <span class="site-state-item-name">tags</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author motion-element">
      <span class="links-of-author-item">
        <a href="https://github.com/blueyi" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;blueyi" rel="noopener" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
      </span>
  </div>


  <div class="links-of-blogroll motion-element">
    <div class="links-of-blogroll-title"><i class="fa fa-link fa-fw"></i>
      Links
    </div>
    <ul class="links-of-blogroll-list">
        <li class="links-of-blogroll-item">
          <a href="http://maxwi.com/" title="http:&#x2F;&#x2F;maxwi.com" rel="noopener" target="_blank">Maxwi</a>
        </li>
    </ul>
  </div>

      </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

        

<div class="copyright">
  
  &copy; 2014 – 
  <span itemprop="copyrightYear">2022</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">blueyi</span>
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-chart-area"></i>
    </span>
    <span title="Symbols count total">750k</span>
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-coffee"></i>
    </span>
    <span title="Reading time total">11:22</span>
</div>
  <div class="powered-by">Powered by <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> & <a href="https://mist.theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Mist</a>
  </div>

        








      </div>
    </footer>
  </div>

  
  <script src="/lib/anime.min.js"></script>
  <script src="/lib/pjax/pjax.min.js"></script>
  <!-- Third-party libraries from jsDelivr. The scheme is pinned to https so the
       scripts are never fetched over plain http and still load from file://. -->
  <script src="https://cdn.jsdelivr.net/npm/jquery@3/dist/jquery.min.js"></script>
  <script src="https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3/dist/jquery.fancybox.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/lozad@1/dist/lozad.min.js"></script>

<script src="/js/utils.js"></script>


<script src="/js/schemes/muse.js"></script>


<script src="/js/next-boot.js"></script>

<script src="/js/bookmark.js"></script>

  <script>
// Global Pjax instance. `selectors` lists the page regions handed to Pjax;
// '.post-toc-wrap' uses the innerHTML switch instead of the default one.
var pjax = new Pjax({
  selectors: ['head title', '#page-configurations', '.content-wrap', '.post-toc-wrap', '.languages', '#pjax'],
  switches: { '.post-toc-wrap': Pjax.switches.innerHTML },
  analytics: false,
  cacheBust: false,
  // Scroll handling is disabled whenever the bookmark feature is enabled.
  scrollTo: !CONFIG.bookmark.enable
});

// After each successful Pjax load, replace every matching <script> with a
// freshly created clone at the end of the same parent, then refresh the theme.
window.addEventListener('pjax:success', function () {
  var staleScripts = document.querySelectorAll('script[data-pjax], script#page-configurations, #pjax script');
  Array.prototype.forEach.call(staleScripts, function (old) {
    var source = old.text || old.textContent || old.innerHTML || '';
    var host = old.parentNode;
    host.removeChild(old);

    var fresh = document.createElement('script');
    // Carry over identifying attributes only when the old node had them.
    ['id', 'className', 'type'].forEach(function (prop) {
      if (old[prop]) {
        fresh[prop] = old[prop];
      }
    });
    if (old.src) {
      fresh.src = old.src;
      // Force synchronous loading of peripheral JS.
      fresh.async = false;
    }
    if (old.dataset.pjax !== undefined) {
      fresh.dataset.pjax = '';
    }
    if (source !== '') {
      fresh.appendChild(document.createTextNode(source));
    }
    host.appendChild(fresh);
  });

  NexT.boot.refresh();
  // Define the motion sequence and bootstrap it, only when motion is enabled.
  if (CONFIG.motion.enable) {
    var integrator = NexT.motion.integrator.init();
    integrator = integrator.add(NexT.motion.middleWares.subMenu);
    integrator = integrator.add(NexT.motion.middleWares.postList);
    integrator.bootstrap();
  }
  NexT.utils.updateSidebarPosition();
});
</script>




  
  <script data-pjax>
    // Baidu link push: requests a tracking GIF whose query string carries the
    // referrer and the page's canonical URL. Nothing is sent when the canonical
    // URL itself points at a *.baidu.com host.
    (function () {
      var canonicalURL, curProtocol;

      // Use the last <link rel="canonical"> that carries an href, if any.
      var links = document.getElementsByTagName('link');
      // `i` is declared locally; the previous loop leaked it as an implicit global.
      for (var i = 0; i < links.length; i++) {
        if (links[i].rel.toLowerCase() == 'canonical' && links[i].href) {
          canonicalURL = links[i].href;
        }
      }

      // The scheme comes from the canonical URL when there is one; otherwise
      // both the scheme and the URL fall back to the current location.
      if (!canonicalURL) {
        curProtocol = window.location.protocol.split(':')[0];
        canonicalURL = window.location.href;
      } else {
        curProtocol = canonicalURL.split(':')[0];
      }

      // Alternation group for the scheme. The earlier pattern used the character
      // class [http|https], which matches any single one of "h t p | s" before
      // "://" rather than the words http/https.
      var baiduHost = /((?:http|https):\/\/[a-zA-Z0-9_.]+\.baidu\.com)/i;
      if (baiduHost.test(canonicalURL)) {
        return;
      }

      var beacon = String(curProtocol).toLowerCase() === 'https'
        ? 'https://sp0.baidu.com/9_Q4simg2RQJ8t7jm9iCKT-xh_/s.gif'
        : '//api.share.baidu.com/s.gif';
      if (document.referrer) {
        beacon += '?r=' + encodeURIComponent(document.referrer);
        // The canonical URL is appended unencoded, exactly as before.
        if (canonicalURL) {
          beacon += '&l=' + canonicalURL;
        }
      } else if (canonicalURL) {
        beacon += '?l=' + canonicalURL;
      }

      // Assigning src on a detached Image triggers the GET request.
      var pixel = new Image();
      pixel.src = beacon;
    })();
  </script>




  
<script src="/js/local-search.js"></script>













    <div id="pjax">
  

  

<!-- Gitalk styles from jsDelivr. The scheme is pinned to https so the file is
     never fetched over plain http and still loads when the page is opened from file://. -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.css">

<script>
// Gitalk bootstrap for #gitalk-container. NexT.utils.loadComments and
// NexT.utils.getScript are defined outside this page; their callbacks are
// nested so that Gitalk is only constructed from inside the getScript callback.
// All declarations stay inside the callbacks: this script lives in #pjax and is
// re-inserted by the pjax:success handler, so a top-level let/const would be
// redeclared on every navigation.
NexT.utils.loadComments(document.querySelector('#gitalk-container'), function () {
  NexT.utils.getScript('//cdn.jsdelivr.net/npm/gitalk@1/dist/gitalk.min.js', function () {
    // NOTE(review): clientSecret is delivered to every visitor inside this inline
    // script — confirm the OAuth app is scoped with that exposure in mind.
    var options = {
      clientID: '0f8243eb2c8b2207980f',
      clientSecret: 'd159633a33519d3b7a48b0ca729032f7d1f38a41',
      repo: 'notes',
      owner: 'blueyi',
      admin: ['blueyi'],
      id: '622c33f79bf41c5b3eaa18bcb6ea8a04',
      language: '',
      distractionFreeMode: true
    };
    var widget = new Gitalk(options);
    widget.render('gitalk-container');
  }, window.Gitalk);
});
</script>

    </div>
</body>
</html>
